Loading in necessary libraries

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.6.2
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

Matching TCGA and GTEX samples

GTEX_file<-"/media/theron/My_Passport/TCGA_junctions/ext_dat/GTEx_recount3_selection_2021-09-14_17_25_41.csv"
TCGA_file<-"/media/theron/My_Passport/TCGA_junctions/ext_dat/TCGA_recount3_selection_2021-09-14_17_27_07.csv"
GTEX_samples <- read.table(GTEX_file,sep=",",header=T)
TCGA_samples <- read.table(TCGA_file,sep=",",header=T)
all_samples<-rbind(GTEX_samples,TCGA_samples)

STUDY_ABREVIATION<-c("LAML","ACC","BLCA","LGG","BRCA","CESC","CHOL","LCML","COAD","CNTL","ESCA","FPPP","GBM","HNSC","KICH","KIRC","KIRP","LIHC","LUAD","LUSC","DLBC","MESO","MISC","OV","PAAD","PCPG","PRAD","READ","SARC","SKCM","STAD","TGCT","THYM","THCA","UCS","UCEC","UVM")
STUDY_NAME<-c("Acute Myeloid Leukemia","Adrenocortical carcinoma","Bladder Urothelial Carcinoma","Brain Lower Grade Glioma","Breast invasive carcinoma","Cervical squamous cell carcinoma and endocervical adenocarcinoma","Cholangiocarcinoma","Chronic Myelogenous Leukemia","Colon adenocarcinoma","Controls","Esophageal carcinoma","FFPE Pilot Phase II","Glioblastoma multiforme","Head and Neck squamous cell carcinoma","Kidney Chromophobe","Kidney renal clear cell carcinoma","Kidney renal papillary cell carcinoma","Liver hepatocellular carcinoma","Lung adenocarcinoma","Lung squamous cell carcinoma","Lymphoid Neoplasm Diffuse Large B-cell Lymphoma","Mesothelioma","Miscellaneous","Ovarian serous cystadenocarcinoma","Pancreatic adenocarcinoma","Pheochromocytoma and Paraganglioma","Prostate adenocarcinoma","Rectum adenocarcinoma","Sarcoma","Skin Cutaneous Melanoma","Stomach adenocarcinoma","Testicular Germ Cell Tumors","Thymoma","Thyroid carcinoma","Uterine Carcinosarcoma","Uterine Corpus Endometrial Carcinoma","Uveal Melanoma")

STUDY_ABREVIATION_NAMES=data.frame(STUDY_ABREVIATION,STUDY_NAME)
colnames(STUDY_ABREVIATION_NAMES)<-c("ABREVIATION","NAME")

all_samples$DESC <- unlist(lapply(all_samples$project,function(proj){
  a<-which(STUDY_ABREVIATION_NAMES$ABREVIATION==proj)
  if (length(a)==0){
    return("")
  } else {
    return(STUDY_ABREVIATION_NAMES$NAME[a])
  }
}))

write.table(data.frame(all_samples$project),
            file="/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/projects.txt",
            sep="\t",
            quote=F,
            col.names=F,
            row.names=F)

Building TCGA-GTEX map

STUDY_ABREVIATION<-c("LAML","ACC","BLCA","LGG","BRCA","CESC","CHOL","LCML","COAD","ESCA","GBM","HNSC","KICH","KIRC","KIRP","LIHC","LUAD","LUSC","DLBC","MESO","OV","PAAD","PCPG","PRAD","READ","SARC","SKCM","STAD","TGCT","THYM","THCA","UCS","UCEC","UVM")
GTEX_map<-c("BLOOD,BONE_MARROW","ADRENAL_GLAND","BLADDER","BRAIN","BREAST","CERVIX_UTERI","LIVER","BLOOD","COLON","ESOPHAGUS","BRAIN","ESOPHAGUS,SALIVARY_GLAND","KIDNEY","KIDNEY","KIDNEY","LIVER","LUNG","LUNG","BLOOD","LUNG","FALLOPIAN_TUBE","PANCREAS","ADRENAL_GLAND","PROSTATE","COLON","ADIPOSE_TISSUE","SKIN","STOMACH","TESTIS","?","THYROID","UTERUS","UTERUS","?")
TCGA_GTEX_map<-data.frame(STUDY_ABREVIATION,GTEX_map)
colnames(TCGA_GTEX_map)<-c("TCGA","GTEX")
write.table(TCGA_GTEX_map,
            file="/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/TCGA_GTEX_map.txt",
            sep="\t",
            quote=F,
            col.names=T,
            row.names=F)

Building the recount3 metadata file; TCGA

STUDY_ABREVIATION<-c("LAML","ACC","BLCA","LGG","BRCA","CESC","CHOL","COAD","ESCA","GBM","HNSC","KICH","KIRC","KIRP","LIHC","LUAD","LUSC","DLBC","MESO","OV","PAAD","PCPG","PRAD","READ","SARC","SKCM","STAD","TGCT","THYM","THCA","UCS","UCEC","UVM")
TCGA_dir <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers"

for (i in seq(length(STUDY_ABREVIATION))){
  if (i == 1){
    all_metadata <- readRDS(sprintf("%s/%s/%s_metadata.rds",TCGA_dir,STUDY_ABREVIATION[i],STUDY_ABREVIATION[i]))
  }
  fill <- readRDS(sprintf("%s/%s/%s_metadata.rds",TCGA_dir,STUDY_ABREVIATION[i],STUDY_ABREVIATION[i]))
  all_metadata<-rbind(all_metadata,fill)
}
saveRDS(all_metadata,file="/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/TCGA_recount3_metadata.rds")

Building the recount3 metadata file; GTEx

GTEX_dir <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/GTEX"
human_projects <- available_projects()
## 2021-10-14 10:40:09 caching file sra.recount_project.MD.gz.
## 2021-10-14 10:40:11 caching file gtex.recount_project.MD.gz.
## 2021-10-14 10:40:13 caching file tcga.recount_project.MD.gz.
proj_info <- subset(
  human_projects,
  file_source %in% c("gtex")
)
GTEX_STUDIES<-proj_info$project

for (i in seq(length(GTEX_STUDIES))){
  if (i == 1){
    all_metadata <- readRDS(sprintf("%s/%s/%s_metadata.rds",GTEX_dir,GTEX_STUDIES[i],GTEX_STUDIES[i]))
  }
  fill <- readRDS(sprintf("%s/%s/%s_metadata.rds",GTEX_dir,GTEX_STUDIES[i],GTEX_STUDIES[i]))
  all_metadata<-rbind(all_metadata,fill)
}
saveRDS(all_metadata,file="/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/GTEX_recount3_metadata.rds")

Analyzing recount3 metadata

TCGA_metadata<-readRDS("/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/TCGA_recount3_metadata.rds")
GTEX_metadata<-readRDS("/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/GTEX_recount3_metadata.rds")

Processing TCGA mutation data

mut_dat <- read_excel("/media/theron/My_Passport/TCGA_junctions/ext_dat/13073_2017_424_MOESM3_ESM_coded.xlsx")
mut_dat_complete<-mut_dat[complete.cases(mut_dat),]

The TCGA MAF summary file

maf_file <- "/media/theron/My_Passport/TCGA_junctions/maf_summary.txt"
mc3_maf = read.table(maf_file,header=T)
mc3_maf$Tumor_Sample_ID <- vapply(TCGAbarcode(mc3_maf$Tumor_Sample_Barcode,sample=T),
                                  function(val){substr(val,1,nchar(val)-1)},
                                  character(1))
rownames(mc3_maf) <- mc3_maf$Tumor_Sample_Barcode
mc3_maf$participant_ID <- TCGAbarcode(mc3_maf$Tumor_Sample_Barcode,participant=T)

Accumulating mutation data per sample

junc_rse_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/juncrse.rds"
junc_rse <- readRDS(junc_rse_file)
junc_metadata <- as.data.frame(junc_rse@colData@listData)
junc_rse_cols <- colnames(junc_metadata)

tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")
# TMB<-list()

optitype_calls <- read.table("/media/theron/My_Passport/TCGA_junctions/ext_dat/OptiTypeCallsHLA_20171207.tsv",header=T,sep=",")
which_are_correct<-vapply(optitype_calls$aliquot_id,function(ID){
  a<-strsplit(ID,"-")[[1]]
    if (nchar(a[1]) > 4){
      return(F)  
    } else {
      return(T)
    }
},logical(1))
optitype_calls_correct <- optitype_calls[which_are_correct,]
optitype_calls_correct$particpant_ID <- TCGAbarcode(optitype_calls_correct$aliquot_id,participant=T)

cluster_metrics_tum <- data.frame(cancers)

for (i in seq(nrow(tumor_data))){
  print(sprintf("%d out of %d",i,nrow(tumor_data)))
  tumor_dir <- tumor_data[i,]
  cancer <- basename(tumor_dir)
  print(cancer)
  if (cancer %in% nogos){next}

  tumor_meta_file <- sprintf("%s/%s_metadata.txt",tumor_dir,cancer)
  tumor_meta <- read.table(tumor_meta_file,quote="",sep="\t")
  tumor_meta$participant_ID <- TCGAbarcode(tumor_meta[,4],participant=T)
  tumor_meta$nbases<-tumor_meta[,ncol(tumor_meta)-9]
  mc3_maf_small<-subset(mc3_maf,participant_ID %in% tumor_meta$participant_ID)
  mc3_maf_small <- mc3_maf_small[complete.cases(mc3_maf_small),]
  mc3_maf_small$type <- vapply(rownames(mc3_maf_small),function(barcode){
    type<-as.numeric(substr(strsplit(barcode,"-")[[1]][4],1,2))
    if (type <= 9){
      return("T")
    } else if (type > 9 & type <= 19){
      return ("N")
    } else {
      return ("C")
    }
  },character(1))
  mc3_maf_small$TMB<-log10(mc3_maf_small$total+1)
  mc3_maf_small <- mc3_maf_small %>% dplyr::filter(type == "T")

  tumor_geno_file <- sprintf("%s/%s_genotypes.txt",tumor_dir,cancer)
  tumor_geno <- read.table(tumor_geno_file,header=T)
  
  summ_file <- read.table(sprintf("%s/summaries.txt",tumor_dir))
  summ_file <- summ_file$V1
  per_sample_summ <- c()
  for (file in summ_file){
    summ <- read.table(file)
    per_sample_summ <- c(per_sample_summ,summ$V1)
  }
  per_sample_summ <- data.frame(per_sample_summ,tumor_geno$sample_id,tumor_geno$type)
  colnames(per_sample_summ) <- c("per_sample_summ","sample_id","type")
  per_sample_summ <- unique(per_sample_summ)
  
  rownames(mc3_maf_small) <- mc3_maf_small$Tumor_Sample_ID
  per_sample_summ$TMB <- mc3_maf_small[per_sample_summ$sample_id,"TMB"]
  
  per_sample_summ <- per_sample_summ %>% dplyr::filter(type == "T" & per_sample_summ > 0)
  
  cluster_metrics_tum[cluster_metrics_tum$cancers==cancer,"splice_imm"] <- median(per_sample_summ$per_sample_summ)
  cluster_metrics_tum[cluster_metrics_tum$cancers==cancer,"TMB"] <- median(per_sample_summ$TMB)

  print(ggplot(per_sample_summ,aes(x=log10(per_sample_summ),y=TMB))+
    geom_point()+
    stat_cor(method = "spearman")+
    geom_smooth(method="lm")+
    labs(title=sprintf(cancer)))
}
## [1] "1 out of 19"
## [1] "BLCA"
## `geom_smooth()` using formula 'y ~ x'

## [1] "2 out of 19"
## [1] "BRCA"
## Warning: Removed 77 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 77 rows containing non-finite values (stat_smooth).
## Warning: Removed 77 rows containing missing values (geom_point).

## [1] "3 out of 19"
## [1] "CHOL"
## `geom_smooth()` using formula 'y ~ x'

## [1] "4 out of 19"
## [1] "COAD"
## `geom_smooth()` using formula 'y ~ x'

## [1] "5 out of 19"
## [1] "ESCA"
## [1] "6 out of 19"
## [1] "GBM"
## [1] "7 out of 19"
## [1] "HNSC"
## `geom_smooth()` using formula 'y ~ x'

## [1] "8 out of 19"
## [1] "KICH"
## `geom_smooth()` using formula 'y ~ x'

## [1] "9 out of 19"
## [1] "KIRC"
## [1] "10 out of 19"
## [1] "KIRP"
## `geom_smooth()` using formula 'y ~ x'

## [1] "11 out of 19"
## [1] "LIHC"
## `geom_smooth()` using formula 'y ~ x'

## [1] "12 out of 19"
## [1] "LUAD"
## `geom_smooth()` using formula 'y ~ x'

## [1] "13 out of 19"
## [1] "LUSC"
## `geom_smooth()` using formula 'y ~ x'

## [1] "14 out of 19"
## [1] "MESO"
## [1] "15 out of 19"
## [1] "PAAD"
## [1] "16 out of 19"
## [1] "PRAD"
## `geom_smooth()` using formula 'y ~ x'

## [1] "17 out of 19"
## [1] "READ"
## `geom_smooth()` using formula 'y ~ x'

## [1] "18 out of 19"
## [1] "THCA"
## `geom_smooth()` using formula 'y ~ x'

## [1] "19 out of 19"
## [1] "UCEC"
## `geom_smooth()` using formula 'y ~ x'

ggplot(cluster_metrics_tum,aes(x=log10(splice_imm+1),y=TMB,label=cancers))+
  geom_text()+
  stat_cor(method = "spearman")+
  xlab("splicing antigenicity")
## Warning: Removed 6 rows containing non-finite values (stat_cor).
## Warning: Removed 6 rows containing missing values (geom_text).

Analyzing Splicing Factor Mutations per cancer type

splicing_factor_genes<-read_excel("/media/theron/My_Passport/TCGA_junctions/ext_dat/splicing_factor_genes1.xlsx")
splicing_factor_genes<-toupper(splicing_factor_genes$Gene)
write.table(data.frame(splicing_factor_genes),
            file="/media/theron/My_Passport/TCGA_junctions/ext_dat/splicing_factor_genes.txt",
            sep="\t",
            quote=F,
            col.names=F,
            row.names=F)
splice_maf<-read.maf("/media/theron/My_Passport/TCGA_junctions/splice_factor.maf")
## -Reading
## -Validating
## --Removed 36209 duplicated variants
## -Silent variants: 214408 
## -Summarizing
## --Possible FLAGS among top ten genes:
##   USH2A
##   OBSCN
##   HMCN1
## -Processing clinical data
## --Missing clinical data
## -Finished in 00:01:18 elapsed (00:01:39 cpu)
splice_maf_samp<-getSampleSummary(splice_maf)
splice_maf_samp$Tumor_Sample_ID <- TCGAbarcode(as.character(splice_maf_samp$Tumor_Sample_Barcode),sample=T)
splice_maf_samp$participant_ID <- TCGAbarcode(as.character(splice_maf_samp$Tumor_Sample_Barcode),participant=T)
rownames(splice_maf_samp)<-splice_maf_samp$Tumor_Sample_Barcode
splice_maf_samp<-data.frame(splice_maf_samp)

Accumulating splicing factor mutation data per sample

junc_rse_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/juncrse.rds"
junc_rse <- readRDS(junc_rse_file)
junc_metadata <- as.data.frame(junc_rse@colData@listData)
junc_rse_cols <- colnames(junc_metadata)

tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")
# TMB<-list()

optitype_calls <- read.table("/media/theron/My_Passport/TCGA_junctions/ext_dat/OptiTypeCallsHLA_20171207.tsv",header=T,sep=",")
which_are_correct<-vapply(optitype_calls$aliquot_id,function(ID){
  a<-strsplit(ID,"-")[[1]]
    if (nchar(a[1]) > 4){
      return(F)  
    } else {
      return(T)
    }
},logical(1))
optitype_calls_correct <- optitype_calls[which_are_correct,]
optitype_calls_correct$particpant_ID <- TCGAbarcode(optitype_calls_correct$aliquot_id,participant=T)

cluster_metrics_tum <- data.frame(cancers)

for (i in seq(nrow(tumor_data))){
  print(sprintf("%d out of %d",i,nrow(tumor_data)))
  tumor_dir <- tumor_data[i,]
  cancer <- basename(tumor_dir)
  print(cancer)
  if (cancer %in% nogos){next}

  tumor_meta_file <- sprintf("%s/%s_metadata.txt",tumor_dir,cancer)
  tumor_meta <- read.table(tumor_meta_file,quote="",sep="\t")
  tumor_meta$participant_ID <- TCGAbarcode(tumor_meta[,4],participant=T)
  tumor_meta$nbases<-tumor_meta[,ncol(tumor_meta)-9]
  splice_maf_samp_small<-subset(splice_maf_samp,participant_ID %in% tumor_meta$participant_ID)
  splice_maf_samp_small <- splice_maf_samp_small[complete.cases(splice_maf_samp_small),]
  rownames(splice_maf_samp_small)<-as.character(splice_maf_samp_small$Tumor_Sample_Barcode)
  splice_maf_samp_small$type <- vapply(rownames(splice_maf_samp_small),function(barcode){
    type<-as.numeric(substr(strsplit(barcode,"-")[[1]][4],1,2))
    if (type <= 9){
      return("T")
    } else if (type > 9 & type <= 19){
      return ("N")
    } else {
      return ("C")
    }
  },character(1))
  splice_maf_samp_small$TMB<-log10(splice_maf_samp_small$total+1)
  splice_maf_samp_small <- splice_maf_samp_small %>% dplyr::filter(type == "T")

  tumor_geno_file <- sprintf("%s/%s_genotypes.txt",tumor_dir,cancer)
  tumor_geno <- read.table(tumor_geno_file,header=T)
  
  summ_file <- read.table(sprintf("%s/summaries.txt",tumor_dir))
  summ_file <- summ_file$V1
  per_sample_summ <- c()
  for (file in summ_file){
    summ <- read.table(file)
    per_sample_summ <- c(per_sample_summ,summ$V1)
  }
  per_sample_summ <- data.frame(per_sample_summ,tumor_geno$sample_id,tumor_geno$type)
  colnames(per_sample_summ) <- c("per_sample_summ","sample_id","type")
  per_sample_summ <- unique(per_sample_summ)
  
  
  rownames(splice_maf_samp_small) <- vapply(splice_maf_samp_small$Tumor_Sample_ID,function(val){substr(val,1,nchar(val)-1)},character(1))
  per_sample_summ$TMB <- splice_maf_samp_small[per_sample_summ$sample_id,"TMB"]
  
  per_sample_summ <- per_sample_summ %>% dplyr::filter(type == "T" & per_sample_summ > 0)
  
  cluster_metrics_tum[cluster_metrics_tum$cancers==cancer,"splice_imm"] <- median(per_sample_summ$per_sample_summ,na.rm=T)
  cluster_metrics_tum[cluster_metrics_tum$cancers==cancer,"TMB"] <- median(per_sample_summ$TMB,na.rm=T)

  print(ggplot(per_sample_summ,aes(x=log10(per_sample_summ),y=TMB))+
    geom_point()+
    stat_cor(method = "spearman")+
    geom_smooth(method="lm")+
    labs(title=sprintf(cancer)))
}
## [1] "1 out of 19"
## [1] "BLCA"
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

## [1] "2 out of 19"
## [1] "BRCA"
## Warning: Removed 82 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 82 rows containing non-finite values (stat_smooth).
## Warning: Removed 82 rows containing missing values (geom_point).

## [1] "3 out of 19"
## [1] "CHOL"
## `geom_smooth()` using formula 'y ~ x'

## [1] "4 out of 19"
## [1] "COAD"
## `geom_smooth()` using formula 'y ~ x'

## [1] "5 out of 19"
## [1] "ESCA"
## [1] "6 out of 19"
## [1] "GBM"
## [1] "7 out of 19"
## [1] "HNSC"
## `geom_smooth()` using formula 'y ~ x'

## [1] "8 out of 19"
## [1] "KICH"
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

## [1] "9 out of 19"
## [1] "KIRC"
## [1] "10 out of 19"
## [1] "KIRP"
## `geom_smooth()` using formula 'y ~ x'

## [1] "11 out of 19"
## [1] "LIHC"
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## [1] "12 out of 19"
## [1] "LUAD"
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

## [1] "13 out of 19"
## [1] "LUSC"
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

## [1] "14 out of 19"
## [1] "MESO"
## [1] "15 out of 19"
## [1] "PAAD"
## [1] "16 out of 19"
## [1] "PRAD"
## Warning: Removed 8 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).

## [1] "17 out of 19"
## [1] "READ"
## `geom_smooth()` using formula 'y ~ x'

## [1] "18 out of 19"
## [1] "THCA"
## Warning: Removed 31 rows containing non-finite values (stat_cor).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 31 rows containing non-finite values (stat_smooth).
## Warning: Removed 31 rows containing missing values (geom_point).

## [1] "19 out of 19"
## [1] "UCEC"
## `geom_smooth()` using formula 'y ~ x'

ggplot(cluster_metrics_tum,aes(x=log10(splice_imm+1),y=TMB,label=cancers))+
  geom_text()+
  stat_cor(method = "spearman")+
  xlab("splicing antigenicity")
## Warning: Removed 5 rows containing non-finite values (stat_cor).
## Warning: Removed 5 rows containing missing values (geom_text).

splice maf heatmaps

splice_maf_data <- splice_maf@data
splice_maf_data <- splice_maf_data %>% dplyr::filter(Hugo_Symbol %in% splicing_factor_genes)
splice_maf_data <- splice_maf_data[,c("Hugo_Symbol","Variant_Classification","Tumor_Sample_Barcode")]
splice_maf_data$Tumor_Sample_ID <- vapply(TCGAbarcode(as.character(splice_maf_data$Tumor_Sample_Barcode),sample=T),function(val){substr(val,1,nchar(val)-1)},character(1))
splice_maf_data$participant_ID <- TCGAbarcode(as.character(splice_maf_data$Tumor_Sample_Barcode),participant=T)
splice_maf_data$cancer <- NA
splice_maf_data$splice_ant <- NA
splice_maf_data_fill <- splice_maf_data
splice_maf_data_fill$cancer <- as.character(splice_maf_data_fill$cancer)
splice_maf_data_fill$splice_ant <- as.numeric(splice_maf_data_fill$splice_ant)


junc_rse_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/juncrse.rds"
junc_rse <- readRDS(junc_rse_file)
junc_metadata <- as.data.frame(junc_rse@colData@listData)
junc_rse_cols <- colnames(junc_metadata)

tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")
# TMB<-list()

optitype_calls <- read.table("/media/theron/My_Passport/TCGA_junctions/ext_dat/OptiTypeCallsHLA_20171207.tsv",header=T,sep=",")
which_are_correct<-vapply(optitype_calls$aliquot_id,function(ID){
  a<-strsplit(ID,"-")[[1]]
    if (nchar(a[1]) > 4){
      return(F)  
    } else {
      return(T)
    }
},logical(1))
optitype_calls_correct <- optitype_calls[which_are_correct,]
optitype_calls_correct$particpant_ID <- TCGAbarcode(optitype_calls_correct$aliquot_id,participant=T)

cluster_metrics_tum <- data.frame(cancers)

for (i in seq(nrow(tumor_data))){
  print(sprintf("%d out of %d",i,nrow(tumor_data)))
  tumor_dir <- tumor_data[i,]
  cancer <- basename(tumor_dir)
  print(cancer)
  if (cancer %in% nogos){next}

  tumor_geno_file <- sprintf("%s/%s_genotypes.txt",tumor_dir,cancer)
  tumor_geno <- read.table(tumor_geno_file,header=T)
  
  summ_file <- read.table(sprintf("%s/summaries.txt",tumor_dir))
  summ_file <- summ_file$V1
  per_sample_summ <- c()
  for (file in summ_file){
    summ <- read.table(file)
    per_sample_summ <- c(per_sample_summ,summ$V1)
  }
  per_sample_summ <- data.frame(per_sample_summ,tumor_geno$sample_id,tumor_geno$type)
  colnames(per_sample_summ) <- c("per_sample_summ","sample_id","type")
  per_sample_summ <- unique(per_sample_summ)
  
  aa <- vapply(seq(nrow(per_sample_summ)),function(row_val){
    ID <- per_sample_summ$sample_id[row_val]
    splice_ant <- per_sample_summ$per_sample_summ[row_val]
    
    locs <- which(splice_maf_data_fill$Tumor_Sample_ID == ID)
    splice_maf_data_fill[locs,"cancer"] <<- cancer
    splice_maf_data_fill[locs,"splice_ant"] <<- splice_ant
    return(T)
  },logical(1))
  
}
## [1] "1 out of 19"
## [1] "BLCA"
## [1] "2 out of 19"
## [1] "BRCA"
## [1] "3 out of 19"
## [1] "CHOL"
## [1] "4 out of 19"
## [1] "COAD"
## [1] "5 out of 19"
## [1] "ESCA"
## [1] "6 out of 19"
## [1] "GBM"
## [1] "7 out of 19"
## [1] "HNSC"
## [1] "8 out of 19"
## [1] "KICH"
## [1] "9 out of 19"
## [1] "KIRC"
## [1] "10 out of 19"
## [1] "KIRP"
## [1] "11 out of 19"
## [1] "LIHC"
## [1] "12 out of 19"
## [1] "LUAD"
## [1] "13 out of 19"
## [1] "LUSC"
## [1] "14 out of 19"
## [1] "MESO"
## [1] "15 out of 19"
## [1] "PAAD"
## [1] "16 out of 19"
## [1] "PRAD"
## [1] "17 out of 19"
## [1] "READ"
## [1] "18 out of 19"
## [1] "THCA"
## [1] "19 out of 19"
## [1] "UCEC"
splice_maf_data_fill <- splice_maf_data_fill[complete.cases(splice_maf_data_fill),]
splice_maf_data_fill <- splice_maf_data_fill %>% dplyr::filter(splice_ant > 0)

mutation heatmap

sample_cancer_dat <- unique(splice_maf_data_fill[,c("Tumor_Sample_ID","cancer","splice_ant")])
a<-lapply(sample_cancer_dat$Tumor_Sample_ID,function(ID){
  splice_maf_data_fill_small <- splice_maf_data_fill %>% dplyr::filter(Tumor_Sample_ID == ID)
  vapply(splicing_factor_genes,function(gene){
    count <- length(which(splice_maf_data_fill_small$Hugo_Symbol == gene))
  },numeric(1))
})
sample_splice_factor_dat <- data.frame(matrix(unlist(a),nrow=nrow(sample_cancer_dat),byrow=T))
rownames(sample_splice_factor_dat) <- sample_cancer_dat$Tumor_Sample_ID
colnames(sample_splice_factor_dat) <- splicing_factor_genes

sample_splice_factor_dat_muts <- sample_splice_factor_dat[which(apply(sample_splice_factor_dat,1,sd) > 0),]
sample_cancer_dat_muts <- sample_cancer_dat[which(apply(sample_splice_factor_dat,1,sd) > 0),]
cancer_order <- order(sample_cancer_dat_muts$cancer)
sample_splice_factor_dat_muts <- sample_splice_factor_dat_muts[cancer_order,]
sample_cancer_dat_muts <- sample_cancer_dat_muts[cancer_order,]
Heatmap(t(scale(t(sample_splice_factor_dat_muts))),
        right_annotation = rowAnnotation(spliceant = anno_barplot(sample_cancer_dat_muts$splice_ant)),
        left_annotation = rowAnnotation(cancer = sample_cancer_dat_muts$cancer),
        show_row_names=F,
        show_column_names = F,
        cluster_rows=T,
        cluster_columns=T)
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.
## 'magick' package is suggested to install to give better rasterization.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

Heatmap(t(scale(t(sample_splice_factor_dat_muts))),
        right_annotation = rowAnnotation(spliceant = anno_barplot(sample_cancer_dat_muts$splice_ant)),
        left_annotation = rowAnnotation(cancer = sample_cancer_dat_muts$cancer),
        show_row_names=F,
        show_column_names = F,
        cluster_rows=F,
        cluster_columns=T)
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.
## 'magick' package is suggested to install to give better rasterization.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

sample_splice_factor_dat_muts summarized

sample_cancer_dat_muts$sum <- apply(sample_splice_factor_dat_muts,1,sum)
for (i in unique(sample_cancer_dat_muts$cancer)){
  specific_cancer <- sample_cancer_dat_muts %>% dplyr::filter(cancer == i)
  print(ggplot(specific_cancer,aes(x=log10(splice_ant+1),y=log2(sum+1)))+geom_point()+labs(title=i))
}

ggplot(sample_cancer_dat_muts,aes(x=cancer,y=log2(sum+1)))+geom_boxplot()

ggplot(sample_cancer_dat_muts,aes(x=cancer,y=log10(splice_ant+1)))+geom_boxplot()

ggplot(sample_cancer_dat_muts,aes(x=log10(splice_ant+1),y=log10(sum+1)))+
  geom_point()+
  stat_cor(method = "spearman")+
    geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

Determining per gene metrics for each cancer type

# splicemutr data needed per cancer type
# per-line counts needed per cancer type
# only keep those proteins that are longer than 9 kmers

tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")

for (i in seq(nrow(tumor_data))){
  print(sprintf("%d out of %d",i,nrow(tumor_data)))
  tumor_dir <- tumor_data[i,]
  cancer <- basename(tumor_dir)
  print(cancer)
  if (cancer %in% nogos){next}
  
  splice_dat_file <- sprintf("%s/%s_splicemutr_dat.txt",tumor_dir,cancer)
  splice_dat <- read.table(splice_dat_file,header=T,sep="\t")
  tumor_geno_file <- sprintf("%s/%s_genotypes.txt",tumor_dir,cancer)
  tumor_geno <- read.table(tumor_geno_file,header=T)
  summary_file <- sprintf("%s/summaries.txt",tumor_dir)
  summaries <- read.table(summary_file)
  summaries <- summaries$V1
  summaries<-unname(vapply(summaries,function(summ){
    str_replace(summ,"kmers_summary","persamp_line")
  },character(1)))
  
  for (summ in seq(length(summaries))){
    if (summ == 1){
      summaries_combined <- read.table(summaries[summ],header=F,sep="\t")
      if (length(summaries)>1){
        summaries_combined <- summaries_combined[,seq(ncol(summaries_combined)-2)]
      }
    } else if (summ == length(summaries)){
      summaries_fill <- read.table(summaries[summ],header=F,sep="\t")
      summaries_combined <- cbind(summaries_combined,summaries_fill)
    } else {
      summaries_fill <- read.table(summaries[summ],header=F,sep="\t")
      summaries_fill <- summaries_fill[,seq(ncol(summaries_fill)-2)]
      summaries_combined <- cbind(summaries_combined,summaries_fill)
    }
  }
  
  sample_types <- sprintf("%s_%s",tumor_geno$sample_id,tumor_geno$type)
  sample_types<-c(sample_types,"row","cluster")
  colnames(summaries_combined)<-sample_types
  tumor_cols <- which(str_detect(sample_types,"_T"))
  summaries_combined <- summaries_combined[,c(tumor_cols,length(sample_types)-1,length(sample_types))]
    
  splice_dat_specific <- splice_dat[summaries_combined$row,]
  rows_to_keep <- which(!is.na(splice_dat_specific$peptide))
  
  summaries_combined <- summaries_combined[rows_to_keep,]
  summaries_combined[is.na(summaries_combined)]<-0
  splice_dat_specific <- splice_dat_specific[rows_to_keep,]
  rows_to_keep <- !(splice_dat_specific$verdict == "annotated" & splice_dat_specific$modified == "changed")
  summaries_combined <- summaries_combined[rows_to_keep,]
  splice_dat_specific <- splice_dat_specific[rows_to_keep,]
  med_summaries_combined <- apply(summaries_combined[,seq(ncol(summaries_combined)-2)],1,median)
  av_summaries_combined <- apply(summaries_combined[,seq(ncol(summaries_combined)-2)],1,mean)
  splice_dat_specific$median_binders_tumor <- med_summaries_combined
  splice_dat_specific$mean_binders_tumor <- av_summaries_combined
  
  saveRDS(splice_dat_specific,file=sprintf("%s/%s_splice_dat_specific.rds",tumor_dir,cancer))
  
  genes <- unique(splice_dat_specific$gene)
  per_gene_data <- data.frame(genes)
  median_binders <- vapply(genes,function(g){
    splice_dat_small <- splice_dat_specific %>% dplyr::filter(gene == g)
    sum(splice_dat_small$median_binders_tumor)
  },numeric(1))
  per_gene_data$median_binders <- median_binders
  ann_prop <- vapply(genes,function(g){
    splice_dat_small <- splice_dat_specific %>% dplyr::filter(gene == g)
    length(which(splice_dat_small$verdict == "annotated"))/nrow(splice_dat_small)
  },numeric(1))
  per_gene_data$ann_prop <- ann_prop
  
  saveRDS(per_gene_data,file=sprintf("%s/%s_per_gene_data.rds",tumor_dir,cancer))
  
}

Compiling per-gene metrics

tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
nogos <- c("ESCA","MESO","PAAD","KIRC","GBM")

genes <- c()
for (i in seq(nrow(tumor_data))){
  print(sprintf("%d out of %d",i,nrow(tumor_data)))
  tumor_dir <- tumor_data[i,]
  cancer <- basename(tumor_dir)
  print(cancer)
  if (cancer %in% nogos){next}
  
  
  per_gene_data<-readRDS(file=sprintf("%s/%s_per_gene_data.rds",tumor_dir,cancer))
  genes <- unique(c(genes,per_gene_data$genes))
  
}
## [1] "1 out of 19"
## [1] "BLCA"
## [1] "2 out of 19"
## [1] "BRCA"
## [1] "3 out of 19"
## [1] "CHOL"
## [1] "4 out of 19"
## [1] "COAD"
## [1] "5 out of 19"
## [1] "ESCA"
## [1] "6 out of 19"
## [1] "GBM"
## [1] "7 out of 19"
## [1] "HNSC"
## [1] "8 out of 19"
## [1] "KICH"
## [1] "9 out of 19"
## [1] "KIRC"
## [1] "10 out of 19"
## [1] "KIRP"
## [1] "11 out of 19"
## [1] "LIHC"
## [1] "12 out of 19"
## [1] "LUAD"
## [1] "13 out of 19"
## [1] "LUSC"
## [1] "14 out of 19"
## [1] "MESO"
## [1] "15 out of 19"
## [1] "PAAD"
## [1] "16 out of 19"
## [1] "PRAD"
## [1] "17 out of 19"
## [1] "READ"
## [1] "18 out of 19"
## [1] "THCA"
## [1] "19 out of 19"
## [1] "UCEC"
per_gene_data_tot_binders <- data.frame(genes)
per_gene_data_tot_prop <- data.frame(genes)

rownames(per_gene_data_tot_binders) <- per_gene_data_tot_binders$genes
rownames(per_gene_data_tot_prop) <- per_gene_data_tot_prop$genes

cancers <- c()
for (i in seq(nrow(tumor_data))){
  print(sprintf("%d out of %d",i,nrow(tumor_data)))
  tumor_dir <- tumor_data[i,]
  cancer <- basename(tumor_dir)
  print(cancer)
  if (cancer %in% nogos){next}
  cancers <- c(cancers,cancer)
  
  per_gene_data<-readRDS(file=sprintf("%s/%s_per_gene_data.rds",tumor_dir,cancer))
  rownames(per_gene_data)<-per_gene_data$genes
  per_gene_data_tot_binders[,cancer]<- -1
  per_gene_data_tot_prop[,cancer]<- -1
  per_gene_data_tot_binders[per_gene_data$genes,cancer] <- per_gene_data$median_binders
  per_gene_data_tot_prop[per_gene_data$genes,cancer] <- per_gene_data$ann_prop
  
}
## [1] "1 out of 19"
## [1] "BLCA"
## [1] "2 out of 19"
## [1] "BRCA"
## [1] "3 out of 19"
## [1] "CHOL"
## [1] "4 out of 19"
## [1] "COAD"
## [1] "5 out of 19"
## [1] "ESCA"
## [1] "6 out of 19"
## [1] "GBM"
## [1] "7 out of 19"
## [1] "HNSC"
## [1] "8 out of 19"
## [1] "KICH"
## [1] "9 out of 19"
## [1] "KIRC"
## [1] "10 out of 19"
## [1] "KIRP"
## [1] "11 out of 19"
## [1] "LIHC"
## [1] "12 out of 19"
## [1] "LUAD"
## [1] "13 out of 19"
## [1] "LUSC"
## [1] "14 out of 19"
## [1] "MESO"
## [1] "15 out of 19"
## [1] "PAAD"
## [1] "16 out of 19"
## [1] "PRAD"
## [1] "17 out of 19"
## [1] "READ"
## [1] "18 out of 19"
## [1] "THCA"
## [1] "19 out of 19"
## [1] "UCEC"
row_ann <- unname(vapply(genes,function(gene){
  if (str_detect(gene,"-")){
    return("Fusion")
  } else {
    return("Single")
  }
},character(1)))
row_ann <- data.frame(row_ann)
rownames(row_ann)<-genes

per_gene_data_tot_binders <- per_gene_data_tot_binders[,seq(2,ncol(per_gene_data_tot_binders))]
per_gene_data_tot_prop <- per_gene_data_tot_prop[,seq(2,ncol(per_gene_data_tot_prop))]

per_gene_comp <- per_gene_data_tot_binders*per_gene_data_tot_prop

Heatmap(log10(per_gene_comp+1),
        right_annotation = rowAnnotation(df=row_ann),
        show_row_names=F,
        show_column_names = T,
        cluster_rows=T,
        cluster_columns=T)
## Warning: The input is a data frame, convert it to the matrix.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.
## 'magick' package is suggested to install to give better rasterization.
## 
## Set `ht_opt$message = FALSE` to turn off this message.